In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import matplotlib.pyplot as plt
import networkx as nx

import plotly.express as px
import plotly.graph_objects as go
In [2]:
#data = pd.read_csv('C:/Users/migue/Desktop/data_preprocess.csv')
#data['TID']=data['Point-of-Sale_ID'].astype(str)+data['ProductFamily_ID'].astype(str)+data['Date'].astype(str)
#data['Year'] = data['Date'].astype(str).str[:4]
#data['Year']=data['Year'].astype(int)
#df = data[['ProductName_ID','Point-of-Sale_ID','Year','Quarter','TID']].copy()
#df.to_csv('C:/Users/migue/Desktop/mba_df.csv',index=False)


#"C:/Users/Pedro/Desktop/Business Cases/BC5/Datasets/mba_dash.csv"
df = pd.read_csv('C:/Users/migue/Desktop/Datasets/mba_df.csv')
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91171152 entries, 0 to 91171151
Data columns (total 5 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   ProductName_ID    int64 
 1   Point-of-Sale_ID  int64 
 2   Year              int64 
 3   Quarter           int64 
 4   TID               object
dtypes: int64(4), object(1)
memory usage: 3.4+ GB
In [4]:
df.head()
Out[4]:
ProductName_ID Point-of-Sale_ID Year Quarter TID
0 649 1 2017 1 1162017-03-04
1 649 1 2016 2 1162016-05-02
2 649 1 2016 4 1162016-10-24
3 649 1 2017 4 1162017-10-13
4 649 1 2017 4 1162017-10-14
In [5]:
# Creating DF for Dashboard Visualization
#mba_dash = df[(df['Year']==2016)&(df['Quarter'].isin([1,2,3]))&(df['Point-of-Sale_ID'].isin([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]))]
#mba_dash.info()
#mba_dash.to_csv('C:/Users/migue/Desktop/mba_dash.csv',index=False)
In [6]:
orders = df[(df['Year']==2016)&(df['Quarter']==3)&(df['Point-of-Sale_ID']==3)]
In [7]:
orders.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17806 entries, 370438 to 645177
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ProductName_ID    17806 non-null  int64 
 1   Point-of-Sale_ID  17806 non-null  int64 
 2   Year              17806 non-null  int64 
 3   Quarter           17806 non-null  int64 
 4   TID               17806 non-null  object
dtypes: int64(4), object(1)
memory usage: 834.7+ KB
In [8]:
pt = pd.pivot_table(orders[['TID','ProductName_ID']], index='TID', columns='ProductName_ID', 
                    aggfunc=lambda x: 1 if len(x)>0 else 0).fillna(0)
pt.head()
Out[8]:
ProductName_ID 6 14 15 17 19 21 24 27 38 39 ... 2817 2834 2838 2839 2841 2845 2846 2847 2848 2850
TID
3102016-07-01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3102016-07-02 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3102016-07-04 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3102016-07-05 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3102016-07-06 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 737 columns

In [9]:
pt.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1432 entries, 3102016-07-01 to 392016-09-30
Columns: 737 entries, 6 to 2850
dtypes: float64(737)
memory usage: 8.1+ MB
In [10]:
frequent_itemsets = apriori(pt, min_support=0.05, use_colnames=True)
frequent_itemsets
Out[10]:
support itemsets
0 0.054469 (21)
1 0.055866 (78)
2 0.058659 (198)
3 0.059358 (200)
4 0.059358 (216)
... ... ...
749 0.050978 (1408, 198, 2379, 2412, 567, 2399)
750 0.050279 (1408, 2379, 2412, 912, 567, 216)
751 0.050978 (1408, 2379, 2412, 567, 216, 2399)
752 0.050279 (1408, 198, 2379, 2412, 912, 567, 216)
753 0.050978 (1408, 198, 2379, 2412, 567, 216, 2399)

754 rows × 2 columns

In [11]:
frequent_itemsets.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754 entries, 0 to 753
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   754 non-null    float64
 1   itemsets  754 non-null    object 
dtypes: float64(1), object(1)
memory usage: 11.9+ KB
In [12]:
# Generate the association rules - by lift
rulesLift = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
rulesLift.sort_values(by='confidence', ascending=False, inplace=True)
rulesLift
Out[12]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction
6303 (1408, 912, 198, 2399) (216) 0.050279 0.059358 0.050279 1.0 16.847059 0.047295 inf
6496 (216, 2378, 2412) (1408, 198) 0.050978 0.053771 0.050978 1.0 18.597403 0.048237 inf
2032 (200, 2412, 198) (216) 0.051676 0.059358 0.051676 1.0 16.847059 0.048609 inf
6504 (2378, 2412) (1408, 198, 216) 0.050978 0.053771 0.050978 1.0 18.597403 0.048237 inf
658 (2378, 2412) (198) 0.050978 0.058659 0.050978 1.0 17.047619 0.047987 inf
... ... ... ... ... ... ... ... ... ...
2141 (2379) (200, 1408, 198) 0.062849 0.050978 0.050279 0.8 15.693151 0.047075 4.745112
4074 (2379) (912, 2378, 216) 0.062849 0.050978 0.050279 0.8 15.693151 0.047075 4.745112
6718 (2379) (219, 2412, 198, 567) 0.062849 0.050978 0.050279 0.8 15.693151 0.047075 4.745112
1082 (2379) (912, 219) 0.062849 0.050978 0.050279 0.8 15.693151 0.047075 4.745112
7557 (2379) (216, 219, 2412, 567) 0.062849 0.050978 0.050279 0.8 15.693151 0.047075 4.745112

10422 rows × 9 columns

In [13]:
data_scatter = dict(type='scatter',
    y=rulesLift['confidence'],
    x=rulesLift['lift'],
    #text=rulesLift.index,
    #mode='markers',
    #marker=dict(
    #size=rulesLift['support'],
    hovertemplate=#'Grand Prix: ' + df_racetracks["name_x"] + '<br>'
                    #'RuleID: ' + rulesLift.index + '<br>'+
                    'Lift: ' + rulesLift["lift"].astype(str) + '<br>'+
                    'Confidence: ' + rulesLift['confidence'].astype(str) + '<br>'+
                    'Support: ' + rulesLift['support'].astype(str) + '<br>'+    
                    'Antecedents: ' + rulesLift['antecedents'].astype(str) + '<br>'+
                    'Consequents: ' + rulesLift['consequents'].astype(str) + '<br>'
    '<extra></extra>',
    #color=scatterdf['Avg Salary'],  # set color equal to a variable
    #color_continuous_scale='mint',  # one of plotly colorscales
    #showscale=False,
    mode='markers',
    marker=dict(size=8,
                #size=rulesLift['support'],
                color=rulesLift['support'],
                colorscale='oranges',
                showscale=True,
                line_width=2),
    )

layout = dict(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_title="Lift",
    yaxis_title="Confidence"
)


fig = go.Figure(data=data_scatter,layout=layout)
fig.show()
In [ ]:
 
In [ ]: